library(tidyverse)
# remotes::install_github("nickbrazeau/discent", ref = "develop")
library(discent)
source("R/utils.R")
Model conceptualization: \[ r_{ij} = \left(\frac{f_i + f_j}{2}\right) \exp\left\{ \frac{-d_{ij}}{M} \right\} \]
Coding Decisions: 1. M and F parameters have different learning rates (uncoupled) 2. M must be positive 3. Geodistance is normalized as part of input 4. ADAM gradient descent optimizer
Using polySimIBD, we have five frameworks of simulation,
for each of which we ran 100 independent realizations (i.e. same migration,
same Ne, different seed). For each framework, there are 25 demes laid
out on a 5x5 square plane. Frameworks are as follows:
# Hyperparameter search grid: every combination of start values and
# learning rates (3 F starts x 5 M starts x 3 F rates x 3 M rates = 135 rows)
fstartsvec <- c(0.1, 0.3, 0.5)
mstartsvec <- c(10, 25, 50, 100, 250) # standardizing geodistance
flearnsvec <- mlearnsvec <- c(1e-2, 1e-3, 1e-4)
search_grid <- setNames(
  expand.grid(fstartsvec, mstartsvec, flearnsvec, mlearnsvec),
  c("fstart", "mstart", "f_learn", "m_learn")
)
There are three major questions regarding our evaluation of
DISCent:
#............................................................
# Part 0: Read in pieces and results
#...........................................................
# read in model framework
# migration-matrix definitions for the simulation frameworks
migmatdf <- readRDS("simdata/00-simulation_setup/inputs/migmat_framework.RDS")
# deme coordinates on the 5x5 square plane (used for spatial plots below;
# assumes columns longnum/latnum/deme -- see plotmn() and the join on "deme")
coordgrid <- readRDS("simdata/00-simulation_setup/inputs/squarecoords.rds")
# read in godisc results
# NB: anchor and escape the extension -- in a regex "." matches any character,
# so the original pattern ".RDS" could match unintended files
discfn <- list.files("simdata/disc_results", pattern = "\\.RDS$", full.names = TRUE)
discrets <- lapply(discfn, function(x){
  df <- readRDS(x)
  # model name = file basename minus its extension; fixed() treats ".RDS" as a
  # literal string rather than a regex
  nm <- stringr::str_replace(basename(x), stringr::fixed(".RDS"), "")
  # file names encode "<framework>_<rep>_<...>"; split out the first two fields
  df <- df %>%
    dplyr::mutate(mod = nm,
                  modbase = stringr::str_split_fixed(nm, "_", n = 3)[,1],
                  modbase = forcats::as_factor(modbase),
                  rep = stringr::str_split_fixed(nm, "_", n = 3)[,2],
                  rep = forcats::as_factor(rep)) %>%
    dplyr::relocate(mod, modbase)
  return(df)
}) %>%
  dplyr::bind_rows()
#......................
# get cost, fs, and ms
#......................
discrets <- discrets %>%
dplyr::mutate(
final_cost = purrr::map_dbl(discret, extract_final_cost),
final_fs = purrr::map(discret, get_fs),
final_m = purrr::map_dbl(discret, get_ms)
)
Remember, we are ignoring reps here which represent different
realizations of the polySimIBD model with the same initial
start frameworks (e.g. lattice, etc.)
# Distribution of final costs for each simulation framework
ggplot(data = discrets) +
  geom_boxplot(mapping = aes(x = modbase, y = final_cost)) +
  ggtitle("Costs over All Model Frameworks") +
  theme_linedraw()
# Deme-level final F estimates against cost, one panel per framework
discrets %>%
  tidyr::unnest(cols = "final_fs") %>%
  ggplot(aes(x = final_cost, y = Final_Fis)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_c() +
  ggtitle("Costs vs Final Fs") +
  theme_linedraw() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))
# Final global M against cost, one panel per framework
discrets %>%
  tidyr::unnest(cols = "final_m") %>%
  ggplot(aes(x = final_cost, y = final_m)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_c() +
  ggtitle("Costs vs Final Ms") +
  theme_linedraw() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))
# Joint view: final F vs final M, colored by cost
discrets %>%
  tidyr::unnest(cols = c("final_fs", "final_m")) %>%
  ggplot(aes(x = Final_Fis, y = final_m, color = final_cost)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_c() +
  ggtitle("Final Fs vs Final Ms w/r/t Cost") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90))
Remember, here we are still looking at all start parameters, not just best by cost: there are 67,500 (= 135 start param combinations x 100 reps x 5 different models) global M estimations and 67,500 * 25 (25 demes) final Fi estimations.
# Final global M against cost by framework
discrets %>%
  tidyr::unnest(cols = c("final_m")) %>%
  ggplot(aes(x = final_cost, y = final_m)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_c() +
  ggtitle("Model Frameworks vs Final Ms") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Deme-level final F per deme, colored by cost.
# Fix: TRUE instead of T (T is a reassignable binding, not a reserved word).
discrets %>%
  tidyr::unnest(cols = "final_fs") %>%
  # ordered factor so the x-axis runs 1..25 following the 5x5 grid numbering
  dplyr::mutate(Deme = factor(Deme, levels = 1:25, ordered = TRUE)) %>%
  ggplot() +
  geom_point(aes(x = Deme, y = Final_Fis, color = final_cost), alpha = 0.5) +
  theme_linedraw() +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_c() +
  ggtitle("Demes vs Final Fs") +
  theme(axis.text.x = element_text(angle = 90))
# polySimIBD Realization (aka Rep)
# (this section heading was fused onto the code line during extraction;
# restored here as a comment so the chunk parses)
discrets %>%
  ggplot() +
  geom_boxplot(aes(x = rep, y = final_cost)) +
  facet_grid(rows = vars(modbase), scales = "free") +
  theme_linedraw() +
  ggtitle("Costs over All Model Frameworks w/r/t Reps") +
  xlab("polySimIBD Realization") +
  theme(axis.text.x = element_blank())
^ Although the above look like a
geom_point plot, they are
actually boxplots, which emphasizes that costs varied considerably w/r/t
different polySimIBD realizations within the same framework
(i.e. same migration matrix).
# Within-run variance of the 25 deme-level F estimates vs cost, by rep
discrets %>%
  dplyr::mutate(
    final_fs_var = purrr::map_dbl(final_fs, ~ var(unlist(.x)))
  ) %>%
  ggplot(aes(x = final_fs_var, y = final_cost, color = rep)) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_d() +
  ggtitle("Costs vs Variance in Final Fs \n over All Model Frameworks w/r/t Reps") +
  theme_linedraw() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))
# Final M vs cost, colored by polySimIBD realization
discrets %>%
  tidyr::unnest(cols = "final_m") %>%
  ggplot(aes(x = final_cost, y = final_m, color = rep)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_d() +
  ggtitle("Costs vs Final Ms") +
  theme_linedraw() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))
Here we have subsetted to the 10 lowest costs by model w/r/t rep and can look at the variation in the F estimates and final M.
# subset to best 10 per mod and rep
# NOTE(review): arrange() is redundant here -- slice_min() already returns rows
# ordered by final_cost within each group; arrange only affects tie ordering.
# NOTE(review): slice_min() keeps ties by default, so a (modbase, rep) group
# can yield more than 10 rows when final_cost values are duplicated.
discrets10best <- discrets %>%
dplyr::group_by(modbase, rep) %>%
dplyr::arrange(final_cost) %>%
dplyr::slice_min(final_cost, n = 10)
# Fs
# Variance of the deme-level F estimates across the best-10 runs,
# tiled by rep x deme for each framework.
# Fixes: TRUE instead of T; explicit .groups (same as summarise()'s default
# "drop_last") so the regrouping message is not emitted.
discrets10best %>%
  tidyr::unnest(cols = "final_fs") %>%
  dplyr::mutate(Deme = factor(Deme, levels = 1:25, ordered = TRUE)) %>%
  dplyr::group_by(modbase, rep, Deme) %>%
  dplyr::summarise(
    varDemerepF = var(Final_Fis),
    .groups = "drop_last"
  ) %>%
  ggplot() +
  geom_point(aes(x = rep, y = Deme, color = varDemerepF),
             alpha = 0.5) +
  scale_color_viridis_c() +
  facet_grid(rows = vars(modbase), scales = "free") +
  ggtitle("Variance in Final Fs by Deme w/r/t Rep and polySimIBD Model") +
  xlab("Rep") +
  theme_linedraw() +
  theme(axis.text.x = element_blank())
# Ms
# Best-10 runs: final M per rep as a beeswarm, colored by cost
discrets10best %>%
  tidyr::unnest(cols = "final_m") %>%
  ggplot(aes(x = rep, y = final_m, color = final_cost)) +
  ggbeeswarm::geom_beeswarm(alpha = 0.5) +
  scale_color_viridis_c() +
  facet_grid(rows = vars(modbase), scales = "free") +
  ggtitle("Final Ms by Deme w/r/t Rep and polySimIBD Model") +
  xlab("Rep") +
  theme_linedraw() +
  theme(axis.text.x = element_blank())
Here we have subsetted to the best parameter estimates by model. We expect the “average” behavior of the simulation frameworks to match our popgen/demography expectations (i.e. lattice, torus). As a result, we expect some consistency in Fs and Ms as well as spatial patterns.
# keep the single lowest-cost run per (framework, rep); slice_min keeps ties,
# so a duplicated final_cost can yield >1 row (see trailing comment)
# NOTE(review): ungroup(rep) removes only the rep grouping -- the result is
# still grouped by modbase, which downstream group_by() calls overwrite
discretsbest <- discrets %>%
dplyr::group_by(modbase, rep) %>%
dplyr::arrange(final_cost) %>%
dplyr::slice_min(final_cost, n = 1) %>%
dplyr::ungroup(rep) # one bad IBD model has a duplicated final_cost
# Across best-per-rep runs: distribution of final F at each deme.
# Fix: TRUE instead of T.
discretsbest %>%
  tidyr::unnest(cols = "final_fs") %>%
  dplyr::mutate(Deme = factor(Deme, levels = 1:25, ordered = TRUE)) %>%
  ggplot() +
  geom_boxplot(aes(x = Deme, y = Final_Fis), alpha = 0.5) +
  theme_linedraw() +
  facet_grid(rows = vars(modbase), scales = "free") +
  # NOTE(review): no color aesthetic is mapped, so this scale is inert
  scale_color_viridis_c() +
  ggtitle("Across Reps (best), Demes vs Final Fs") +
  theme(axis.text.x = element_text(angle = 90))
^ Interesting variation in the lattice model.
# Across best-per-rep runs: distribution of final M by framework
ggplot(data = tidyr::unnest(discretsbest, cols = "final_m")) +
  geom_boxplot(aes(x = modbase, y = final_m), alpha = 0.5) +
  ggtitle("Across Reps (best), Final Ms") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90))
^ Interesting to see that the model has no idea what to do w/ the Bad IBD
- that’s promising, as the distance has a good bit of error in it.
# Across best-per-rep runs: final M against cost by framework
discretsbest %>%
  tidyr::unnest(cols = "final_m") %>%
  ggplot(aes(x = final_cost, y = final_m)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(modbase), scales = "free") +
  scale_color_viridis_c() +
  ggtitle("Across Reps (best), Final Ms") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90))
Is there any consistency in our start parameters across these reps for each “lowest” cost?
# Recover the start parameters used for each best-cost run: the non-"m"
# entries of start_params share a single F start value; "m" is the M start
discstart <- discretsbest %>%
  dplyr::mutate(
    fstart = purrr::map_dbl(start_params, ~ unique(.x[names(.x) != "m"])),
    mstart = purrr::map_dbl(start_params, ~ .x[names(.x) == "m"])
  )
# Which F start values produced the best-cost fits?
ggplot(data = discstart) +
  geom_boxplot(aes(x = modbase, y = fstart)) +
  ggtitle("Start F across Models for Best Rep Cost Results") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90))
# Which M start values produced the best-cost fits?
ggplot(data = discstart) +
  geom_boxplot(aes(x = modbase, y = mstart)) +
  ggtitle("Start M across Models for Best Rep Cost Results") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90))
# bring together w/ spatial data
# Per-deme summary statistics of the best-per-rep final F estimates, joined
# to the grid coordinates for mapping.
# Fixes: stray space in the subset call; explicit .groups (identical to
# summarise()'s default "drop_last") so the regrouping message is not emitted.
coordgrid$deme <- as.character(coordgrid$deme) # match Deme's type for the join
discretsum <- discretsbest %>%
  tidyr::unnest(cols = "final_fs") %>%
  dplyr::group_by(modbase, Deme) %>%
  dplyr::summarise(
    n = n(),
    meanFi = mean(Final_Fis),
    varFi = var(Final_Fis),
    minFi = min(Final_Fis),
    maxFi = max(Final_Fis),
    .groups = "drop_last"
  ) %>%
  dplyr::rename(deme = Deme) %>%
  dplyr::left_join(., coordgrid, by = "deme")
# need to split up b/c have different DISC scales
#' Plot one summary statistic of deme-level F on the coordinate grid.
#'
#' @param discretsumitem one model's rows of discretsum (needs longnum,
#'   latnum, and deme columns from the coordgrid join)
#' @param colnm character name of the summary column to color by
#'   (e.g. "meanFi"); the mapply() call below always passes a string
#' @param modnm model name used in the plot title
#' @return a ggplot object
plotmn <- function(discretsumitem, colnm, modnm) {
  # colnm is a plain string, so use the tidy-eval .data pronoun rather than
  # the deprecated aes_string() (defunct path in ggplot2 >= 3.4) and the
  # needless enquo()/quo_name() round-trip
  plotObj <- discretsumitem %>%
    ggplot() +
    geom_point(aes(x = longnum, y = latnum, color = .data[[colnm]]), size = 5) +
    geom_text(aes(x = longnum, y = latnum, label = deme),
              color = "#ffffff", size = 3) +
    scale_color_viridis_c(paste(colnm, " DISCs")) +
    ylim(c(-2, 24)) +
    ggtitle(paste(modnm, " Model")) +
    theme_linedraw() +
    theme(axis.text = element_blank())
  return(plotObj)
}
# split out and expand to match lvls needed to run:
# one copy of each model per summary statistic (meanFi/varFi/minFi/maxFi),
# models in alphabetical order, stats cycling within each model
bymod <- split(discretsum, discretsum$modbase)
bymod <- bymod[order(names(bymod))]
discretsumbymod <- bymod[rep(seq_along(bymod), each = 4)]
modnms <- names(discretsumbymod)
lvls <- rep(c("meanFi", "varFi", "minFi", "maxFi"), 5)
plotObjs <- mapply(plotmn,
                   discretsumitem = discretsumbymod,
                   colnm = lvls,
                   modnm = modnms, SIMPLIFY = FALSE)
# one 2x2 grid of the four statistics per model
cowplot::plot_grid(plotlist = plotObjs[1:4])
cowplot::plot_grid(plotlist = plotObjs[5:8])
cowplot::plot_grid(plotlist = plotObjs[9:12])
cowplot::plot_grid(plotlist = plotObjs[13:16])
cowplot::plot_grid(plotlist = plotObjs[17:20])
Plot of Final M distribution.
# one call instead of 4
# Distribution of final M by framework
ggplot(data = tidyr::unnest(discretsbest, cols = "final_m")) +
  geom_boxplot(aes(x = modbase, y = final_m)) +
  ggtitle("Final Ms over Model Frameworks") +
  theme_linedraw()
# Final M vs cost, colored and faceted by framework
discretsbest %>%
  tidyr::unnest(cols = "final_m") %>%
  ggplot(aes(x = final_m, y = final_cost, color = modbase)) +
  geom_point() +
  facet_grid(rows = vars(modbase), scales = "free") +
  theme_linedraw() +
  ggtitle("Final Ms vs Cost over Model Frameworks")
Versus M Summary Statistics
# table
# Summary statistics of final M per framework, rendered as an exportable
# DT table.
# Fixes: superseded mutate_if() -> across(where()); TRUE/FALSE instead of T/F.
discretsbest %>%
  tidyr::unnest(cols = "final_m") %>%
  dplyr::group_by(modbase) %>%
  dplyr::summarise(
    n = n(),
    meanM = mean(final_m),
    varM = var(final_m),
    minM = min(final_m),
    maxM = max(final_m)) %>%
  dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 2))) %>%
  DT::datatable(.,
                rownames = FALSE,
                extensions = 'Buttons',
                options = list(
                  searching = TRUE,
                  pageLength = 5,
                  dom = 'Bfrtip',
                  buttons = c('csv')))
^ This was the DISCent call:
# Reference: the DISCent call used to generate the results analyzed above
# (shown for provenance; discdat/start_params/f_learn/m_learn were set per
# search-grid row in the run script)
discent::deme_inbreeding_spcoef(discdat = discdat,
start_params = start_params,
f_learningrate = f_learn, # F and M learning rates set independently (uncoupled)
m_learningrate = m_learn,
m_lowerbound = 0, # M constrained positive
m_upperbound = Inf, # no upper bound on M -- see discussion of large-M behavior
b1 = 0.9, # ADAM first-moment decay
b2 = 0.999, # ADAM second-moment decay
e = 1e-8, # ADAM epsilon
normalize_geodist = TRUE, # geodistances rescaled to [0, 1]
steps = 5e5,
thin = 50,
report_progress = FALSE,
return_verbose = FALSE)
As you can see, I didn’t bound the M … but part of the ADAM equation takes into account the steps (in calculating the bias for the first and second moment - i.e. in our code here, cpp lines 176-177 in src/main.cpp for DISCent):
# Correct bias in first and second moment
# (excerpt from src/main.cpp in DISCent; ADAM bias correction -- early steps
#  are inflated by 1/(1 - b^step), and the correction vanishes as step grows)
m1t_fi_hat[i] = m1t_fi[step][i] / (1-pow(b1, step));
v2t_fi_hat[i] = v2t_fi[step][i] / (1-pow(b2, step));
Because we normalized our geo-distances, they are on a scale [0,1], which means that the very high values of M essentially reduce the \(e^{\frac{-d}{M}}\) term to 1.
Given that reps here match between bad-IBD and good-IBD, can do a paired cost difference.
# Paired comparison: the bad- and good-IBD frameworks share reps, so we can
# difference their best final costs within each rep
discretsbest_badIBD <- discretsbest %>%
  dplyr::ungroup() %>%
  dplyr::filter(modbase == "badIsoByDist") %>%
  dplyr::select(rep, badIBD_final_cost = final_cost)
discretsbest_goodIBD <- discretsbest %>%
  dplyr::ungroup() %>%
  dplyr::filter(modbase == "IsoByDist") %>%
  dplyr::select(rep, goodIBD_final_cost = final_cost)
comparebadgood <- discretsbest_badIBD %>%
  dplyr::inner_join(discretsbest_goodIBD, by = "rep") %>%
  dplyr::mutate(costdiff = badIBD_final_cost - goodIBD_final_cost)
# Distribution of paired cost differences (bad minus good)
ggplot(data = comparebadgood) +
  geom_boxplot(aes(y = costdiff)) +
  xlab("Cost Diff.") +
  ggtitle("Final Costs of Bad IsoByDist (wrong Distances) vs Correct Distance IsoByDist", subtitle = "Eq is $Bad - Good$, so negative numbers indicate cost identified good as better model (minimizing cost)") +
  theme_linedraw() +
  theme(axis.text.x = element_blank())